#include "ChDictionary.h"

#include <fstream>
#include <iostream>
#include <sstream>
#include <vector>

#include "CandidateResult.h"
#include "CppJieba.h"
#include "EditDistance.h"
#include "configure.h"
#include "mylogger.h"

using std::ifstream;
using std::istringstream;
using std::vector;

// Populates the dictionary at construction time by running both loaders:
// first the word/frequency table, then the per-character index.
// Either loader calls abort() if its input file cannot be opened.
ChDictionary::ChDictionary() {
    this->getFrequency();
    this->getIndex();
}

// Looks up candidate words for `queryWord`.
//
// The query is split into its distinct characters with cutString(); for each
// character the index yields a set of 1-based line numbers, and each line
// number selects the (word, frequency) entry at position `line - 1` in _freq.
//
// Returns the distinct (word, frequency) pairs collected that way, in the
// iteration order of the intermediate set. Characters that are not present in
// the index contribute nothing.
vector<pair<string, int>> ChDictionary::doQuery(const string& queryWord) {
    const vector<string> soloChinese = cutString(queryWord);
    set<pair<string, int>> res;
    for (const string& str : soloChinese) {
        // Use find() rather than operator[]: operator[] would insert an empty
        // entry into the index for every unknown character that is queried.
        const auto found = _index.find(str);
        if (found == _index.end()) {
            continue;
        }
        for (const int line : found->second) {
            // Line numbers are 1-based; ignore any that do not map to an
            // entry of _freq instead of reading out of bounds.
            if (line < 1 || static_cast<size_t>(line) > _freq.size()) {
                continue;
            }
            res.insert(_freq[line - 1]);
        }
    }
    return vector<pair<string, int>>(res.begin(), res.end());
}

void ChDictionary::getFrequency() {
    string ChdictPath =
        Configuration::getInstance()->getConfigMap()["CN_DICT_OUTPUT_PATH"];
    ifstream freqStream(ChdictPath);
    if (!freqStream.is_open()) {
        LogError("Open Chinese frequency file failed");
        abort();
    }
    string line, word;
    int freq;
    while (getline(freqStream, line)) {
        istringstream iss(line);
        if (iss >> word >> freq) {
            _freq.emplace_back(word, freq);
        }
    }
}

void ChDictionary::getIndex() {
    string ChIndexPath =
        Configuration::getInstance()->getConfigMap()["CN_INDEX_OUTPUT_PATH"];
    ifstream indexStream(ChIndexPath);
    if (!indexStream.is_open()) {
        LogError("Open Chinese index file failed");
        abort();
    }
    string line, word;
    while (getline(indexStream, line)) {
        istringstream iss(line);
        iss >> word;
        int lineNum;
        set<int> wordLineNums;
        while (iss >> lineNum) {
            wordLineNums.insert(lineNum);
        }
        _index.insert(std::make_pair(word, std::move(wordLineNums)));
    }
}

// Splits `chstr` into its distinct characters.
//
// The byte length of each character is derived from its lead byte:
//   >= 0xFC -> 6, >= 0xF8 -> 5, >= 0xF0 -> 4, >= 0xE0 -> 3, >= 0xC0 -> 2,
//   anything else -> 1.
// Characters are collected in a set, so repeated characters appear once and
// the returned vector follows the set's iteration order rather than the
// order of appearance in `chstr`.
//
// If the final lead byte announces more bytes than remain in the string, the
// remaining bytes are returned as the last piece and the scan stops.
vector<string> ChDictionary::cutString(const string& chstr) {
    set<string> output;
    size_t len = 1;
    // `i < size()` (not `!=`): a truncated trailing sequence advances `i`
    // past the end, which must terminate the loop instead of indexing and
    // calling substr() beyond the string.
    for (size_t i = 0; i < chstr.size(); i += len) {
        const unsigned char byte = static_cast<unsigned char>(chstr[i]);
        if (byte >= 0xFC) {
            len = 6;
        } else if (byte >= 0xF8) {
            len = 5;
        } else if (byte >= 0xF0) {
            len = 4;
        } else if (byte >= 0xE0) {
            len = 3;
        } else if (byte >= 0xC0) {
            len = 2;
        } else {
            len = 1;
        }
        // substr() clamps the count to the bytes that are actually left.
        output.insert(chstr.substr(i, len));
    }
    return vector<string>(output.begin(), output.end());
}
